library(Matrix)
library(MASS)
library(fda)
library(lme4)
library(pedigreemm)
library(mvnfast)
library(ggplot2)
library(plotly)
\[ \bf{Y} = \bf{Z^G \alpha} + \bf{Z^E \gamma} + \bf{\epsilon}\] with the following distribution of the random vectors: \[\begin{align*} \alpha &\sim N(\bf{0}, \bf{A \otimes C^G})\\ \gamma &\sim N(\bf{0}, \bf{I_N \otimes C^G})\\ \epsilon & \sim N(\bf{0}, \sigma^2_{res} *\bf{I_n}) \end{align*}\] assuming there are \(N\) individuals and total \(n\) measurements.
For the simulation study, we will fix a basis \(\phi(t)\) for our functional space and true covariance matrices \(\bf{C^G}\) and \(\bf{C^E}\). Therefore the true genetic and environmental covariance functions which will be estimated are: \[\begin{align*} G(t,s) & = \phi(t)^T \ast \bf{C^G} \ast \phi(t)\\ E(t,s) & = \phi(t)^T \ast \bf{C^E} \ast \phi(t) \end{align*}\] Then we will generate a set of responses and fit the mixed-effect model to get the estimated covariance functions, which then will be compared with the true ones.
Step 1: Fix functional basis, covariance matrices and residual variance. Here we will use 5 cubic B-spline basis.
### use b-spline basis
basisObj <- create.bspline.basis(c(0,1), nbasis = 5, norder = 4)
### genetic covariance matrix
C_gen <- matrix(c(750, 10 ,130, 80, 250,
10, 800, 30, 15, 40,
130, 30, 700, 50, 130,
80, 15, 50, 420, 50,
250, 40, 130, 50, 330), nrow = 5, byrow = T)
### environmental covariance matrix
C_env <- matrix(c(750, 10 ,130, 80, 250,
10, 800, 30, 15, 40,
130, 30, 700, 50, 130,
80, 15, 50, 420, 50,
250, 40, 130, 50, 330), nrow = 5, byrow = T)
### residual variance
sigma2 <- 50
Step 2: Set the distribution of the random effect vectors \(\bf{\alpha}\) and \(\bf{\gamma}\).
Remark: we will assume in total there are 873 individuals so the same genetic relationship matrix \(A\) computed in developing our mixed-effect model will be used in our simulation study. For now, assume each individual has 10 regular sampling points from the unit interval. We will use 3 (or 4) principal components to re-construct the covariance functions.
### reparameterised genetic covariance
C_gen_para <- as(kronecker(A, C_gen), "dgCMatrix")
### reparameterised environmental covariance
I_N <- as(diag(873), "dgCMatrix")
C_env_para <- as(kronecker(I_N, C_env), "dgCMatrix")
### distribution of the genetic random vector
mu <- rep(0, dim(C_gen_para)[1])
alpha <- rmvn(n=50, mu = mu, sigma = C_gen_para) # here we generate 50 random vectors from multivariate normal distribution
gamma <- rmvn(n=50, mu = mu, sigma = C_env_para)
### distribution of the error vector
mu_res <- rep(0, 8730)
I_n <- as(diag(8730), "dgCMatrix")
res_cov <- as(sigma2 * I_n, "dgCMatrix")
res <- rmvn(n=50, mu=mu_res, sigma = res_cov) # error vector
Step 3: Use the modular functions of lme4() to compute
the random effect design matrices.
uniqueIds <- seq(1,873, length=873)
id <- rep(uniqueIds, each=10)
time_rang <- seq(0,1,length=10)
basis <- eval.basis(time_rang, basisObj)
b1 <- rep(basis[,1], times = 873)
b2 <- rep(basis[,2], times = 873)
b3 <- rep(basis[,3], times = 873)
b4 <- rep(basis[,4], times = 873)
b5 <- rep(basis[,5], times = 873)
y_pre <- rep(0, 8730) # dummy response values, will be updated
df_pre <- data.frame(id = id, y_pre = y_pre, b1 = b1, b2 = b2, b3 = b3, b4 = b4, b5 = b5)
parsedFormula <- y_pre ~ (-1 + df_pre$b1 + df_pre$b2 + df_pre$b3 + df_pre$b4 + df_pre$b5 | df_pre$id) +
(-1 + df_pre$b1 + df_pre$b2 + df_pre$b3 + df_pre$b4 + df_pre$b5 | df_pre$id)
Z_pre <- t(lFormula(formula=parsedFormula, data=df_pre)$reTrms$Zt)
### update the genetic covariance matrix
L <- as(t(chol(A)), "dgCMatrix")
I5 <- as(diag(5), "dgCMatrix")
M <- kronecker(L, I5)
ZE <- Z_pre[,1:4365]
ZG <- Z_pre[,1:4365] %*% M
df_pre$y_pre <- as.vector(ZG %*% alpha[1,] + ZE %*% gamma[1,] + res[1,]) # random effects + residual
Step 4: Use lm() to calculate the fixed-effect.
f <- lm(y_pre ~ -1 + df_pre$b1 + df_pre$b2 + df_pre$b3 + df_pre$b4 + df_pre$b5, data = df_pre)
fix_ef <- f$coefficients[1] * df_pre$b1 + f$coefficients[2] * df_pre$b2 + f$coefficients[3] * df_pre$b3 +
f$coefficients[4] * df_pre$b4 + f$coefficients[5] + df_pre$b5
Step 5: Generate response and reform into a dataframe for model-fitting.
response <- as.vector(fix_ef + df_pre$y_pre)
df_simu <- data.frame(id = id, response = response)
## Data smoothing
timefine <- seq(0,1,length=25) # dense time grid
y_hat <- matrix(0, 25, 873)
lambda <- rep(0, 873)
for (i in 1:873){
ss <- smooth.spline(time_rang, y_list[[i]], cv = FALSE)
y_hat[,i] <- predict(ss, timefine)$y
lambda[i] <- ss$lambda
}
matplot(timefine, y_hat, col = 1:873, type= "l") ### smoothed curves
## FPCA
fpcaobj <- prcomp(x=t(y_hat), retx = TRUE, center = TRUE, rank. = 4)
pcs <- fpcaobj$rotation # eigen vectors as basis functions for moedel-fitting
summary(pcs)
## PC1 PC2 PC3 PC4
## Min. :-0.31543 Min. :-0.31034 Min. :-0.42098 Min. :-0.54873
## 1st Qu.:-0.26880 1st Qu.:-0.08898 1st Qu.:-0.22864 1st Qu.:-0.12050
## Median :-0.17205 Median : 0.10105 Median :-0.04158 Median : 0.01714
## Mean :-0.17514 Mean : 0.05059 Mean :-0.08218 Mean :-0.00331
## 3rd Qu.:-0.09028 3rd Qu.: 0.23144 3rd Qu.: 0.08326 3rd Qu.: 0.13593
## Max. :-0.02831 Max. : 0.27498 Max. : 0.14811 Max. : 0.43943
pcs[,1] %*% pcs[,2]
## [,1]
## [1,] 9.367507e-17
## Combine the smoothed data and basis functions to a dataframe
subjectID <- rep(unique(df_simu$id), each=25)
response_test <- c(y_hat)
basis1 <- rep(pcs[,1], times = 873)
basis2 <- rep(pcs[,2], times = 873)
basis3 <- rep(pcs[,3], times = 873)
basis4 <- rep(pcs[,4], times = 873)
df_test <- data.frame(id = subjectID, y= response_test, phi1 = basis1, phi2 = basis2, phi3 = basis3, phi4 = basis4)
## Fit FMEM
fform <- y ~ -1 + df_test$phi1 + df_test$phi2 + df_test$phi3 +
(-1 + df_test$phi1 + df_test$phi2 + df_test$phi3 | df_test$id) +
(-1 + df_test$phi1 + df_test$phi2 + df_test$phi3 | df_test$id)
system.time(
ft <- fit_genetic_fmm(fform, df_test, A, pcs[,1:3])
) # user system elapsed
## 用户 系统 流逝
## 175.73 8.79 271.32
summary(ft)
## Linear mixed model fit by REML ['lmerMod']
##
## REML criterion at convergence: 176501.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.8032 -0.5044 0.0076 0.5180 7.3512
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## df_test.id df_test$phi1 10698.65 103.434
## df_test$phi2 3978.43 63.075 -0.15
## df_test$phi3 3486.79 59.049 0.25 -0.09
## df_test.id.1 df_test$phi1 673.95 25.961
## df_test$phi2 82.06 9.059 -1.00
## df_test$phi3 277.50 16.658 1.00 -1.00
## Residual 127.08 11.273
## Number of obs: 21825, groups: df_test$id, 873
##
## Fixed effects:
## Estimate Std. Error t value
## df_test$phi1 38.464 11.049 3.481
## df_test$phi2 -21.163 6.729 -3.145
## df_test$phi3 20.376 6.323 3.222
##
## Correlation of Fixed Effects:
## df_t$1 df_t$2
## df_test$ph2 -0.156
## df_test$ph3 0.252 -0.092
# Extract covariance function
vc <- VarCorr(ft)
CG <- vc[["df_test.id"]] ## genetic covariance
CE <- vc[["df_test.id.1"]] ## environmental covariance
### Convert to genetic covariance function
CG_fun <- pcs[,1:3] %*% CG %*% t(pcs[,1:3])
### environmental covariance function
CE_fun <- pcs[,1:3] %*% CE %*% t(pcs[,1:3])
### Phenotypic covariance function
P_fun <- CG_fun + CE_fun
## Fit FMEM
fform1 <- y ~ -1 + df_test$phi1 + df_test$phi2 + df_test$phi3 + df_test$phi4 +
(-1 + df_test$phi1 + df_test$phi2 + df_test$phi3 + df_test$phi4 | df_test$id) +
(-1 + df_test$phi1 + df_test$phi2 + df_test$phi3 + df_test$phi4 | df_test$id)
system.time(
ft1 <- fit_genetic_fmm(fform1, df_test, A, pcs)
) # user system elapsed
## 用户 系统 流逝
## 867.79 34.80 2420.16
summary(ft1)
## Linear mixed model fit by REML ['lmerMod']
##
## REML criterion at convergence: 161955.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.1931 -0.4539 0.0046 0.4551 4.7763
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## df_test.id df_test$phi1 10648.81 103.193
## df_test$phi2 4090.85 63.960 -0.15
## df_test$phi3 3483.33 59.020 0.23 -0.08
## df_test$phi4 985.57 31.394 0.04 -0.09 -0.09
## df_test.id.1 df_test$phi1 765.85 27.674
## df_test$phi2 84.21 9.176 -1.00
## df_test$phi3 347.04 18.629 1.00 -1.00
## df_test$phi4 8.22 2.867 -1.00 1.00 -1.00
## Residual 53.05 7.284
## Number of obs: 21825, groups: df_test$id, 873
##
## Fixed effects:
## Estimate Std. Error t value
## df_test$phi1 38.511 11.024 3.493
## df_test$phi2 -21.180 6.815 -3.108
## df_test$phi3 20.423 6.320 3.232
## df_test$phi4 -14.236 3.351 -4.248
##
## Correlation of Fixed Effects:
## df_t$1 df_t$2 df_t$3
## df_test$ph2 -0.151
## df_test$ph3 0.233 -0.082
## df_test$ph4 0.032 -0.087 -0.090
# Extract covariance function
vc1 <- VarCorr(ft1)
CG1 <- vc1[["df_test.id"]] ## genetic covariance
CE1 <- vc1[["df_test.id.1"]] ## environmental covariance
### Convert to genetic covariance function
CG_fun1 <- pcs %*% CG1 %*% t(pcs)
### environmental covariance function
CE_fun1 <- pcs %*% CE1 %*% t(pcs)
### Phenotypic covariance function
P_fun1 <- CG_fun1 + CE_fun1